library(dplyr)
library(ggplot2)

# 5-8
library(readr)
library(tidyverse)
library(correlationfunnel)
library(DataExplorer)
library(WVPlots)
library(ggthemes)
library(ROCR)
library(caret)
library(corrplot)

library(gridExtra)

Q1 - Q4

Load the dataset as below:

data <- read.csv("KAG.csv", stringsAsFactors = FALSE)
str(data)
## 'data.frame':    1143 obs. of  15 variables:
##  $ X                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ad_id               : int  708746 708749 708771 708815 708818 708820 708889 708895 708953 708958 ...
##  $ campaign_id         : int  916 916 916 916 916 916 916 916 916 916 ...
##  $ age                 : int  32 32 32 32 32 32 32 32 32 32 ...
##  $ gender              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ interest            : int  15 16 20 28 28 29 15 16 27 28 ...
##  $ Impressions         : int  7350 17861 693 4259 4133 1915 15615 10951 2355 9502 ...
##  $ Clicks              : int  1 2 0 1 1 0 3 1 1 3 ...
##  $ Spent               : num  1.43 1.82 0 1.25 1.29 ...
##  $ Total_Conversion    : int  2 2 1 1 1 1 1 1 1 1 ...
##  $ Approved_Conversion : int  1 0 0 0 1 1 0 1 0 0 ...
##  $ CTR                 : num  0.0136 0.0112 0 0.0235 0.0242 0 0.0192 0.0091 0.0425 0.0316 ...
##  $ CPC                 : num  1.43 0.91 0 1.25 1.29 ...
##  $ CostPerConv_Total   : num  0.715 0.91 0 1.25 1.29 0 4.77 1.27 1.5 3.16 ...
##  $ CostPerConv_Approved: num  1.43 1.82 0 1.25 1.29 ...
colnames(data)
##  [1] "X"                    "ad_id"                "campaign_id"         
##  [4] "age"                  "gender"               "interest"            
##  [7] "Impressions"          "Clicks"               "Spent"               
## [10] "Total_Conversion"     "Approved_Conversion"  "CTR"                 
## [13] "CPC"                  "CostPerConv_Total"    "CostPerConv_Approved"
summary(data)
##        X              ad_id          campaign_id        age       
##  Min.   :   1.0   Min.   : 708746   Min.   : 916   Min.   :32.00  
##  1st Qu.: 286.5   1st Qu.: 777632   1st Qu.: 936   1st Qu.:32.00  
##  Median : 572.0   Median :1121185   Median :1178   Median :37.00  
##  Mean   : 572.0   Mean   : 987261   Mean   :1067   Mean   :38.32  
##  3rd Qu.: 857.5   3rd Qu.:1121804   3rd Qu.:1178   3rd Qu.:42.00  
##  Max.   :1143.0   Max.   :1314415   Max.   :1178   Max.   :47.00  
##      gender          interest       Impressions          Clicks      
##  Min.   :0.0000   Min.   :  2.00   Min.   :     87   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.: 16.00   1st Qu.:   6504   1st Qu.:  1.00  
##  Median :0.0000   Median : 25.00   Median :  51509   Median :  8.00  
##  Mean   :0.4821   Mean   : 32.77   Mean   : 186732   Mean   : 33.39  
##  3rd Qu.:1.0000   3rd Qu.: 31.00   3rd Qu.: 221769   3rd Qu.: 37.50  
##  Max.   :1.0000   Max.   :114.00   Max.   :3052003   Max.   :421.00  
##      Spent        Total_Conversion Approved_Conversion      CTR         
##  Min.   :  0.00   Min.   : 0.000   Min.   : 0.000      Min.   :0.00000  
##  1st Qu.:  1.48   1st Qu.: 1.000   1st Qu.: 0.000      1st Qu.:0.01005  
##  Median : 12.37   Median : 1.000   Median : 1.000      Median :0.01600  
##  Mean   : 51.36   Mean   : 2.856   Mean   : 0.944      Mean   :0.01642  
##  3rd Qu.: 60.02   3rd Qu.: 3.000   3rd Qu.: 1.000      3rd Qu.:0.02340  
##  Max.   :639.95   Max.   :60.000   Max.   :21.000      Max.   :0.10590  
##       CPC        CostPerConv_Total CostPerConv_Approved
##  Min.   :0.000   Min.   :  0.000   Min.   :  0.000     
##  1st Qu.:1.234   1st Qu.:  1.390   1st Qu.:  1.475     
##  Median :1.450   Median :  8.332   Median : 11.010     
##  Mean   :1.228   Mean   : 16.222   Mean   : 33.628     
##  3rd Qu.:1.600   3rd Qu.: 21.560   3rd Qu.: 41.675     
##  Max.   :2.212   Max.   :332.990   Max.   :541.700
Q.1 Which ad (provide ad_id as the answer) among the ads that have the least CPC led to the most impressions?
data %>%
  filter (CPC == min(CPC)) %>% 
  filter (Impressions == max(Impressions)) %>% 
  select(ad_id, CPC, Impressions)
##     ad_id CPC Impressions
## 1 1121094   0       24362

Answer: ad_id = 1121094

Q.2 What campaign (provide campaign_id as the answer) had spent least efficiently on brand awareness on an average(i.e. most Cost per mille or CPM: use total cost for the campaign / total impressions in thousands)?
data %>% 
  group_by(campaign_id) %>% 
  summarise_each(sum, Spent, Impressions) %>% 
  mutate(Efficiency = Spent/Impressions*1000) %>% 
  arrange(desc(Efficiency))
## # A tibble: 3 x 4
##   campaign_id  Spent Impressions Efficiency
##         <int>  <dbl>       <int>      <dbl>
## 1         936  2893.     8128187      0.356
## 2         916   150.      482925      0.310
## 3        1178 55662.   204823716      0.272

Answer: Campaign_id 936 had spent least efficiently on brand awareneww on an average

Q.3 Assume each conversion (‘Total_Conversion’) is worth $5, each approved conversion (‘Approved_Conversion’) is worth $50. ROAS (return on advertising spent) is revenue as a percentage of the advertising spent . Calculate ROAS and round it to two decimals.
Make a boxplot of the ROAS grouped by gender for interest = 15, 21, 101 (or interest_id = 15, 21, 101) in one graph. Also try to use the function ‘+ scale_y_log10()’ in ggplot to make the visualization look better (to do so, you just need to add ‘+ scale_y_log10()’ after your ggplot function). The x-axis label should be ‘Interest ID’ while the y-axis label should be ROAS.
df_3 <- data %>% 
  filter(interest %in% c(15,21,101), Spent !=0) %>% 
  mutate(ROAS = round((5*Total_Conversion + 50*Approved_Conversion)/Spent,2)) %>%
  select(interest, gender, ROAS) %>% 
  mutate(gender = as.factor(gender)) %>% 
  mutate(interest = as.factor(interest))


ggplot(data=df_3, mapping = aes(x=interest, y=ROAS, fill=gender))+
  geom_boxplot()+
  scale_y_log10()+
  xlab('Interest ID')+
  ylab('ROAS')+
  ggtitle('ROAS By Gender')+
  theme(plot.title = element_text(hjust = 0.5), legend.position = 'bottom')

Q.4 Summarize the median and mean of ROAS by genders when campaign_id == 1178.
data %>% 
  filter(campaign_id == 1178, Spent != 0) %>% 
  mutate(ROAS = round((5*Total_Conversion + 50*Approved_Conversion)/Spent,2)) %>%
  mutate(gender = as.factor(gender)) %>% 
  select(campaign_id, gender,ROAS) %>%
  group_by(gender) %>% 
  summarize(mean_ROAS = mean(ROAS), median_ROAS = median(ROAS))
## # A tibble: 2 x 3
##   gender mean_ROAS median_ROAS
##   <fct>      <dbl>       <dbl>
## 1 0           2.55        1.19
## 2 1           1.58        0.73
Q5-Q8

Load the advertising1 (Links to an external site.) dataset using readr.

df_ad = read_csv('advertising1.csv', col_names=TRUE, col_types='nnnnccfcTf')
str(df_ad)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of  10 variables:
##  $ Daily Time Spent on Site: num  69 80.2 69.5 74.1 68.4 ...
##  $ Age                     : num  35 31 26 29 35 23 33 48 30 20 ...
##  $ Area Income             : num  61834 68442 59786 54806 73890 ...
##  $ Daily Internet Usage    : num  256 194 236 246 226 ...
##  $ Ad Topic Line           : chr  "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
##  $ City                    : chr  "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
##  $ Male                    : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 2 2 ...
##  $ Country                 : chr  "Tunisia" "Nauru" "San Marino" "Italy" ...
##  $ Timestamp               : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
##  $ Clicked on Ad           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Daily Time Spent on Site` = col_number(),
##   ..   Age = col_number(),
##   ..   `Area Income` = col_number(),
##   ..   `Daily Internet Usage` = col_number(),
##   ..   `Ad Topic Line` = col_character(),
##   ..   City = col_character(),
##   ..   Male = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Country = col_character(),
##   ..   Timestamp = col_datetime(format = ""),
##   ..   `Clicked on Ad` = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
##   .. )
Q5
  1. We aim to explore the dataset so that we can better choose a model to implement. Plot histograms for at least 2 of the continuous variables in the dataset. Note it is acceptable to plot more than 2. [1 point]
# Histogram of continuous variables Age
p1 <- ggplot(data=df_ad, aes(x=Age))+
  geom_histogram(bins=20)+
  ggtitle('Histogram of Age')+
  theme(text = element_text(size=8))

# Histogram of continuous variable Area Income
p2 <- ggplot(data=df_ad, aes(x=`Area Income`))+
  geom_histogram(bins=20)+
  ggtitle('Histogram of Area Income')+
  theme(text = element_text(size=8))

# Histogram of continuous variables Daily Time Spent on Site
p3 <- ggplot(data=df_ad, aes(x=`Daily Time Spent on Site`))+
  geom_histogram(bins=20)+
  ggtitle('Histogram of Daily Time Spent on Site')+
  theme(text = element_text(size=8))

# Histogram of continuous variable Area Income
p4 <- ggplot(data=df_ad, aes(x=`Daily Internet Usage`))+
  geom_histogram(bins=20)+
  ggtitle('Histogram of Daily Internet Usage')+
  theme(text = element_text(size=8))


grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)

  1. Again on the track of exploring the dataset, plot at least 2 bar charts reflecting the counts of different values for different variables. Note it is acceptable to plot more than 2. [1 point]
p5 <- ggplot(data=df_ad, aes(x=Male))+
  geom_bar()+
  ggtitle('Bar Chart of Gender')

p6 <- ggplot(data=df_ad, aes(x=`Clicked on Ad`))+
  geom_bar()+
  ggtitle('Bar Chart of Clicked On Ad')


grid.arrange(p5, p6, nrow=1, ncol=2)

  1. Plot boxplots for Age, Area.Income, Daily.Internet.Usage and Daily.Time.Spent.on.Site separated by the variable Clicked.on.Ad. To clarify, we want to create 4 plots, each of which has 2 boxplots: 1 for people who clicked on the ad, one for those who didn’t. [2 points]
pc1 <- ggplot(data=df_ad, aes(y=Age, x=`Clicked on Ad`))+
  geom_boxplot()+
  ggtitle('Boxplot of Age')+
  theme(text = element_text(size=8))

pc2 <- ggplot(data=df_ad, aes(y=`Area Income`, x=`Clicked on Ad`))+
  geom_boxplot()+
  ggtitle('Boxplot of Area Income')+
  theme(text = element_text(size=8))

pc3 <- ggplot(data=df_ad, aes(y=`Daily Internet Usage`, x=`Clicked on Ad`))+
  geom_boxplot()+
  ggtitle('Boxplot of Daily Internet Usage')+
  theme(text = element_text(size=8))

pc4 <- ggplot(data=df_ad, aes(y=`Daily Time Spent on Site`, x=`Clicked on Ad`))+
  geom_boxplot()+
  ggtitle('Boxplot of Daily Time Spent on Site')+
  theme(text = element_text(size=8))


grid.arrange(pc1, pc2, pc3, pc4, nrow=2, ncol=2)

  1. Based on our preliminary boxplots, would you expect an older person to be more likely to click on the ad than someone younger?

Answer: Yes, from the box plot of Age by Clicked.on.Ad, older person is more likely to click on the ad than younger person.

Q6

Part (a) [3 points] 1. Make a scatter plot for Area.Income against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.

ggplot(df_ad, aes(x=Age, y=`Area Income`,shape=`Clicked on Ad`,color=`Clicked on Ad`))+
  geom_point()

  1. Based on this plot, would you expect a 31-year-old person with an Area income of $62,000 to click on the ad or not?

Answer: No, based on the plot that area is more likely has no click.

Part (b) [3 points]

  1. Similar to part a), create a scatter plot for Daily.Time.Spent.on.Site against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.
ggplot(df_ad, aes(x=Age, y=`Daily Time Spent on Site`,shape=`Clicked on Ad`,color=`Clicked on Ad`))+
  geom_point()

  1. Based on this plot, would you expect a 50-year-old person who spends 60 minutes daily on the site to click on the ad or not?

Answer: Yes, 50 year old and 60 mins is more likely to click.

Q7

Part (a) 1. Now that we have done some exploratory data analysis to get a better understanding of our raw data, we can begin to move towards designing a model to predict advert clicks. 2. Generate a correlation funnel (using the correlation funnel package) to see which of the variable in the dataset have the most correlation with having clicked the advert. NOTE: Here we are creating the correlation funnel in regards to HAVING clicked the advert, rather than not. This will lead to a minor distinction in your code between the 2 cases. However, it will not affect your results and subsequent variable selection.

names(df_ad) <- gsub(' ','.',names(df_ad))

df_ad %>%
  #select(-c(Ad.Topic.Line,City,Country,Timestamp)) %>%
  select(-Timestamp) %>% 
  mutate_if(is.numeric,as.numeric) %>%
  binarize() %>%
  correlate(Clicked.on.Ad__1) %>%
  plot_correlation_funnel(interactive = TRUE)

Part (b)

  1. Based on the generated correlation funnel, choose the 4 most covarying variables (with having clicked the advert) and run a logistic regression model for Clicked.on.Ad using these 4 variables.
  2. Output the summary of this model.
lm_a = glm(Clicked.on.Ad~Daily.Internet.Usage+Daily.Time.Spent.on.Site+Age+Area.Income,data=df_ad, family = 'binomial')
summary(lm_a)
## 
## Call:
## glm(formula = Clicked.on.Ad ~ Daily.Internet.Usage + Daily.Time.Spent.on.Site + 
##     Age + Area.Income, family = "binomial", data = df_ad)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4578  -0.1341  -0.0333   0.0167   3.1961  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               2.713e+01  2.714e+00   9.995  < 2e-16 ***
## Daily.Internet.Usage     -6.391e-02  6.745e-03  -9.475  < 2e-16 ***
## Daily.Time.Spent.on.Site -1.919e-01  2.066e-02  -9.291  < 2e-16 ***
## Age                       1.709e-01  2.568e-02   6.655 2.83e-11 ***
## Area.Income              -1.354e-04  1.868e-05  -7.247 4.25e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386.3  on 999  degrees of freedom
## Residual deviance:  182.9  on 995  degrees of freedom
## AIC: 192.9
## 
## Number of Fisher Scoring iterations: 8
Q8

Now using the caret package, create a confusion matrix for the model predictions and actual clicks. Note you do not need to graph or plot this confusion matrix. How many false-negative occurrences do you observe? Recall false negative means the instances where the model predicts the case to be false when in reality it is true. For this example, this refers to cases where the ad is clicked but the model predicts that it isn’t

df_ad$predict_prob <- predict(lm_a, df_ad, type='response')

df_predict <-df_ad %>% 
  mutate(predict_click = if_else(predict_prob>0.8,1,0)) %>% 
  mutate(predict_click = as.factor(predict_click))

confusionMatrix(df_predict$predict_click,df_predict$Clicked.on.Ad)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 497  36
##          1   3 464
##                                           
##                Accuracy : 0.961           
##                  95% CI : (0.9471, 0.9721)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.922           
##                                           
##  Mcnemar's Test P-Value : 2.99e-07        
##                                           
##             Sensitivity : 0.9940          
##             Specificity : 0.9280          
##          Pos Pred Value : 0.9325          
##          Neg Pred Value : 0.9936          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4970          
##    Detection Prevalence : 0.5330          
##       Balanced Accuracy : 0.9610          
##                                           
##        'Positive' Class : 0               
## 

Answer: The false negative is reference as 1 and prediction as 0. Based on the matrix, the false negative case is 36.